Preparation of input files

The input files for this analysis are .mirna format files that are generated through miraligner. PCR duplicates are removed using seqcluster.

Library loading and set up

suppressMessages(
  c(library(isomiRs),
    library(DESeq2),
    library(tximport),
    library(pheatmap),
    library(gridExtra),
    library(grid),
    library(ggplot2),
    library(lattice),
    library(reshape),
    library(mixOmics),
    library(gplots),
    library(RColorBrewer),
    library(readr),
    library(dplyr),
    library(data.table),
    library(tidyverse),
    library(rtracklayer),
    library(Biostrings),
    library(Rsubread),
    library(ggrepel),
    library(CLIPanalyze),
    library(plotly)
   )
)
##   [1] "isomiRs"              "SummarizedExperiment" "DelayedArray"        
##   [4] "matrixStats"          "Biobase"              "GenomicRanges"       
##   [7] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
##  [10] "BiocGenerics"         "parallel"             "stats4"              
##  [13] "DiscriMiner"          "stats"                "graphics"            
##  [16] "grDevices"            "utils"                "datasets"            
##  [19] "methods"              "base"                 "DESeq2"              
##  [22] "isomiRs"              "SummarizedExperiment" "DelayedArray"        
##  [25] "matrixStats"          "Biobase"              "GenomicRanges"       
##  [28] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
##  [31] "BiocGenerics"         "parallel"             "stats4"              
##  [34] "DiscriMiner"          "stats"                "graphics"            
##  [37] "grDevices"            "utils"                "datasets"            
##  [40] "methods"              "base"                 "tximport"            
##  [43] "DESeq2"               "isomiRs"              "SummarizedExperiment"
##  [46] "DelayedArray"         "matrixStats"          "Biobase"             
##  [49] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [52] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [55] "stats4"               "DiscriMiner"          "stats"               
##  [58] "graphics"             "grDevices"            "utils"               
##  [61] "datasets"             "methods"              "base"                
##  [64] "pheatmap"             "tximport"             "DESeq2"              
##  [67] "isomiRs"              "SummarizedExperiment" "DelayedArray"        
##  [70] "matrixStats"          "Biobase"              "GenomicRanges"       
##  [73] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
##  [76] "BiocGenerics"         "parallel"             "stats4"              
##  [79] "DiscriMiner"          "stats"                "graphics"            
##  [82] "grDevices"            "utils"                "datasets"            
##  [85] "methods"              "base"                 "gridExtra"           
##  [88] "pheatmap"             "tximport"             "DESeq2"              
##  [91] "isomiRs"              "SummarizedExperiment" "DelayedArray"        
##  [94] "matrixStats"          "Biobase"              "GenomicRanges"       
##  [97] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [100] "BiocGenerics"         "parallel"             "stats4"              
## [103] "DiscriMiner"          "stats"                "graphics"            
## [106] "grDevices"            "utils"                "datasets"            
## [109] "methods"              "base"                 "grid"                
## [112] "gridExtra"            "pheatmap"             "tximport"            
## [115] "DESeq2"               "isomiRs"              "SummarizedExperiment"
## [118] "DelayedArray"         "matrixStats"          "Biobase"             
## [121] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [124] "S4Vectors"            "BiocGenerics"         "parallel"            
## [127] "stats4"               "DiscriMiner"          "stats"               
## [130] "graphics"             "grDevices"            "utils"               
## [133] "datasets"             "methods"              "base"                
## [136] "ggplot2"              "grid"                 "gridExtra"           
## [139] "pheatmap"             "tximport"             "DESeq2"              
## [142] "isomiRs"              "SummarizedExperiment" "DelayedArray"        
## [145] "matrixStats"          "Biobase"              "GenomicRanges"       
## [148] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [151] "BiocGenerics"         "parallel"             "stats4"              
## [154] "DiscriMiner"          "stats"                "graphics"            
## [157] "grDevices"            "utils"                "datasets"            
## [160] "methods"              "base"                 "lattice"             
## [163] "ggplot2"              "grid"                 "gridExtra"           
## [166] "pheatmap"             "tximport"             "DESeq2"              
## [169] "isomiRs"              "SummarizedExperiment" "DelayedArray"        
## [172] "matrixStats"          "Biobase"              "GenomicRanges"       
## [175] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [178] "BiocGenerics"         "parallel"             "stats4"              
## [181] "DiscriMiner"          "stats"                "graphics"            
## [184] "grDevices"            "utils"                "datasets"            
## [187] "methods"              "base"                 "reshape"             
## [190] "lattice"              "ggplot2"              "grid"                
## [193] "gridExtra"            "pheatmap"             "tximport"            
## [196] "DESeq2"               "isomiRs"              "SummarizedExperiment"
## [199] "DelayedArray"         "matrixStats"          "Biobase"             
## [202] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [205] "S4Vectors"            "BiocGenerics"         "parallel"            
## [208] "stats4"               "DiscriMiner"          "stats"               
## [211] "graphics"             "grDevices"            "utils"               
## [214] "datasets"             "methods"              "base"                
## [217] "mixOmics"             "MASS"                 "reshape"             
## [220] "lattice"              "ggplot2"              "grid"                
## [223] "gridExtra"            "pheatmap"             "tximport"            
## [226] "DESeq2"               "isomiRs"              "SummarizedExperiment"
## [229] "DelayedArray"         "matrixStats"          "Biobase"             
## [232] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [235] "S4Vectors"            "BiocGenerics"         "parallel"            
## [238] "stats4"               "DiscriMiner"          "stats"               
## [241] "graphics"             "grDevices"            "utils"               
## [244] "datasets"             "methods"              "base"                
## [247] "gplots"               "mixOmics"             "MASS"                
## [250] "reshape"              "lattice"              "ggplot2"             
## [253] "grid"                 "gridExtra"            "pheatmap"            
## [256] "tximport"             "DESeq2"               "isomiRs"             
## [259] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
## [262] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [265] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [268] "parallel"             "stats4"               "DiscriMiner"         
## [271] "stats"                "graphics"             "grDevices"           
## [274] "utils"                "datasets"             "methods"             
## [277] "base"                 "RColorBrewer"         "gplots"              
## [280] "mixOmics"             "MASS"                 "reshape"             
## [283] "lattice"              "ggplot2"              "grid"                
## [286] "gridExtra"            "pheatmap"             "tximport"            
## [289] "DESeq2"               "isomiRs"              "SummarizedExperiment"
## [292] "DelayedArray"         "matrixStats"          "Biobase"             
## [295] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [298] "S4Vectors"            "BiocGenerics"         "parallel"            
## [301] "stats4"               "DiscriMiner"          "stats"               
## [304] "graphics"             "grDevices"            "utils"               
## [307] "datasets"             "methods"              "base"                
## [310] "readr"                "RColorBrewer"         "gplots"              
## [313] "mixOmics"             "MASS"                 "reshape"             
## [316] "lattice"              "ggplot2"              "grid"                
## [319] "gridExtra"            "pheatmap"             "tximport"            
## [322] "DESeq2"               "isomiRs"              "SummarizedExperiment"
## [325] "DelayedArray"         "matrixStats"          "Biobase"             
## [328] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [331] "S4Vectors"            "BiocGenerics"         "parallel"            
## [334] "stats4"               "DiscriMiner"          "stats"               
## [337] "graphics"             "grDevices"            "utils"               
## [340] "datasets"             "methods"              "base"                
## [343] "dplyr"                "readr"                "RColorBrewer"        
## [346] "gplots"               "mixOmics"             "MASS"                
## [349] "reshape"              "lattice"              "ggplot2"             
## [352] "grid"                 "gridExtra"            "pheatmap"            
## [355] "tximport"             "DESeq2"               "isomiRs"             
## [358] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
## [361] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [364] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [367] "parallel"             "stats4"               "DiscriMiner"         
## [370] "stats"                "graphics"             "grDevices"           
## [373] "utils"                "datasets"             "methods"             
## [376] "base"                 "data.table"           "dplyr"               
## [379] "readr"                "RColorBrewer"         "gplots"              
## [382] "mixOmics"             "MASS"                 "reshape"             
## [385] "lattice"              "ggplot2"              "grid"                
## [388] "gridExtra"            "pheatmap"             "tximport"            
## [391] "DESeq2"               "isomiRs"              "SummarizedExperiment"
## [394] "DelayedArray"         "matrixStats"          "Biobase"             
## [397] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [400] "S4Vectors"            "BiocGenerics"         "parallel"            
## [403] "stats4"               "DiscriMiner"          "stats"               
## [406] "graphics"             "grDevices"            "utils"               
## [409] "datasets"             "methods"              "base"                
## [412] "forcats"              "stringr"              "purrr"               
## [415] "tidyr"                "tibble"               "tidyverse"           
## [418] "data.table"           "dplyr"                "readr"               
## [421] "RColorBrewer"         "gplots"               "mixOmics"            
## [424] "MASS"                 "reshape"              "lattice"             
## [427] "ggplot2"              "grid"                 "gridExtra"           
## [430] "pheatmap"             "tximport"             "DESeq2"              
## [433] "isomiRs"              "SummarizedExperiment" "DelayedArray"        
## [436] "matrixStats"          "Biobase"              "GenomicRanges"       
## [439] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [442] "BiocGenerics"         "parallel"             "stats4"              
## [445] "DiscriMiner"          "stats"                "graphics"            
## [448] "grDevices"            "utils"                "datasets"            
## [451] "methods"              "base"                 "rtracklayer"         
## [454] "forcats"              "stringr"              "purrr"               
## [457] "tidyr"                "tibble"               "tidyverse"           
## [460] "data.table"           "dplyr"                "readr"               
## [463] "RColorBrewer"         "gplots"               "mixOmics"            
## [466] "MASS"                 "reshape"              "lattice"             
## [469] "ggplot2"              "grid"                 "gridExtra"           
## [472] "pheatmap"             "tximport"             "DESeq2"              
## [475] "isomiRs"              "SummarizedExperiment" "DelayedArray"        
## [478] "matrixStats"          "Biobase"              "GenomicRanges"       
## [481] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [484] "BiocGenerics"         "parallel"             "stats4"              
## [487] "DiscriMiner"          "stats"                "graphics"            
## [490] "grDevices"            "utils"                "datasets"            
## [493] "methods"              "base"                 "Biostrings"          
## [496] "XVector"              "rtracklayer"          "forcats"             
## [499] "stringr"              "purrr"                "tidyr"               
## [502] "tibble"               "tidyverse"            "data.table"          
## [505] "dplyr"                "readr"                "RColorBrewer"        
## [508] "gplots"               "mixOmics"             "MASS"                
## [511] "reshape"              "lattice"              "ggplot2"             
## [514] "grid"                 "gridExtra"            "pheatmap"            
## [517] "tximport"             "DESeq2"               "isomiRs"             
## [520] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
## [523] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [526] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [529] "parallel"             "stats4"               "DiscriMiner"         
## [532] "stats"                "graphics"             "grDevices"           
## [535] "utils"                "datasets"             "methods"             
## [538] "base"                 "Rsubread"             "Biostrings"          
## [541] "XVector"              "rtracklayer"          "forcats"             
## [544] "stringr"              "purrr"                "tidyr"               
## [547] "tibble"               "tidyverse"            "data.table"          
## [550] "dplyr"                "readr"                "RColorBrewer"        
## [553] "gplots"               "mixOmics"             "MASS"                
## [556] "reshape"              "lattice"              "ggplot2"             
## [559] "grid"                 "gridExtra"            "pheatmap"            
## [562] "tximport"             "DESeq2"               "isomiRs"             
## [565] "SummarizedExperiment" "DelayedArray"         "matrixStats"         
## [568] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [571] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [574] "parallel"             "stats4"               "DiscriMiner"         
## [577] "stats"                "graphics"             "grDevices"           
## [580] "utils"                "datasets"             "methods"             
## [583] "base"                 "ggrepel"              "Rsubread"            
## [586] "Biostrings"           "XVector"              "rtracklayer"         
## [589] "forcats"              "stringr"              "purrr"               
## [592] "tidyr"                "tibble"               "tidyverse"           
## [595] "data.table"           "dplyr"                "readr"               
## [598] "RColorBrewer"         "gplots"               "mixOmics"            
## [601] "MASS"                 "reshape"              "lattice"             
## [604] "ggplot2"              "grid"                 "gridExtra"           
## [607] "pheatmap"             "tximport"             "DESeq2"              
## [610] "isomiRs"              "SummarizedExperiment" "DelayedArray"        
## [613] "matrixStats"          "Biobase"              "GenomicRanges"       
## [616] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [619] "BiocGenerics"         "parallel"             "stats4"              
## [622] "DiscriMiner"          "stats"                "graphics"            
## [625] "grDevices"            "utils"                "datasets"            
## [628] "methods"              "base"                 "CLIPanalyze"         
## [631] "GenomicAlignments"    "Rsamtools"            "GenomicFeatures"     
## [634] "AnnotationDbi"        "plyr"                 "ggrepel"             
## [637] "Rsubread"             "Biostrings"           "XVector"             
## [640] "rtracklayer"          "forcats"              "stringr"             
## [643] "purrr"                "tidyr"                "tibble"              
## [646] "tidyverse"            "data.table"           "dplyr"               
## [649] "readr"                "RColorBrewer"         "gplots"              
## [652] "mixOmics"             "MASS"                 "reshape"             
## [655] "lattice"              "ggplot2"              "grid"                
## [658] "gridExtra"            "pheatmap"             "tximport"            
## [661] "DESeq2"               "isomiRs"              "SummarizedExperiment"
## [664] "DelayedArray"         "matrixStats"          "Biobase"             
## [667] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [670] "S4Vectors"            "BiocGenerics"         "parallel"            
## [673] "stats4"               "DiscriMiner"          "stats"               
## [676] "graphics"             "grDevices"            "utils"               
## [679] "datasets"             "methods"              "base"                
## [682] "plotly"               "CLIPanalyze"          "GenomicAlignments"   
## [685] "Rsamtools"            "GenomicFeatures"      "AnnotationDbi"       
## [688] "plyr"                 "ggrepel"              "Rsubread"            
## [691] "Biostrings"           "XVector"              "rtracklayer"         
## [694] "forcats"              "stringr"              "purrr"               
## [697] "tidyr"                "tibble"               "tidyverse"           
## [700] "data.table"           "dplyr"                "readr"               
## [703] "RColorBrewer"         "gplots"               "mixOmics"            
## [706] "MASS"                 "reshape"              "lattice"             
## [709] "ggplot2"              "grid"                 "gridExtra"           
## [712] "pheatmap"             "tximport"             "DESeq2"              
## [715] "isomiRs"              "SummarizedExperiment" "DelayedArray"        
## [718] "matrixStats"          "Biobase"              "GenomicRanges"       
## [721] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [724] "BiocGenerics"         "parallel"             "stats4"              
## [727] "DiscriMiner"          "stats"                "graphics"            
## [730] "grDevices"            "utils"                "datasets"            
## [733] "methods"              "base"

Initial analysis

Compile input files into isomiRs

Set working directory to the folder that contains only gene count .mirna files

setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/miRNA_Analysis/Gene Counts/HEAP-CLIP_09282019_miRNA")
inDir = getwd()
mirFiles = list.files(inDir, pattern=".mirna$", full.names = TRUE)
basename(mirFiles)
## [1] "HVA4_miRNA_LIB043893_GEN00166438_R1_barcode.cutadapt.mirna" 
## [2] "HVA5_miRNA_LIB043893_GEN00166441_R1_barcode.cutadapt.mirna" 
## [3] "HVA6_miRNA_LIB043893_GEN00166444_R1_barcode.cutadapt.mirna" 
## [4] "HVAK4_miRNA_LIB043893_GEN00166447_R1_barcode.cutadapt.mirna"
## [5] "HVAK5_miRNA_LIB043893_GEN00166450_R1_barcode.cutadapt.mirna"
## [6] "HVAK6_miRNA_LIB043893_GEN00166453_R1_barcode.cutadapt.mirna"
isomiRsampletable <- data.frame(
  row.names = c('HVA_4_miRNA','HVA_5_miRNA','HVA_6_miRNA','HVAK_4_miRNA','HVAK_5_miRNA', 'HVAK_6_miRNA'),
  condition = c('control','control','control','experimental','experimental', 'experimental'),
  libType = c('single-end','single-end','single-end','single-end','single-end','single-end')
)

ids <- IsomirDataSeqFromFiles(mirFiles,
                              coldata = isomiRsampletable,
                              design = ~ condition)

miRNA general abundance and count matrix

dir.create("PDF_figure", showWarnings = FALSE)
isoPlot(ids, type="all")
## Ussing 905 isomiRs.

pdf("PDF_figure/isoPlot.pdf",
    width = 8,
    height = 6)
isoPlot(ids, type="all")
## Ussing 905 isomiRs.
dev.off()
## quartz_off_screen 
##                 2
# overview of the counts
head(counts(ids))
##               HVA_4_miRNA HVA_5_miRNA HVA_6_miRNA HVAK_4_miRNA HVAK_5_miRNA
## mmu-let-7a-5p        1864         303        1510         2418         2010
## mmu-let-7b-3p           5           9           7            5            7
## mmu-let-7b-5p        5724        6348        5986         6298         5846
## mmu-let-7c-5p        3595        3365        3612         3721         3583
## mmu-let-7d-3p         239         597         504          349          321
## mmu-let-7d-5p         531         152         489          588          569
##               HVAK_6_miRNA
## mmu-let-7a-5p         2096
## mmu-let-7b-3p            3
## mmu-let-7b-5p         5955
## mmu-let-7c-5p         3775
## mmu-let-7d-3p          343
## mmu-let-7d-5p          544
# output the count matrix
ids = isoNorm(ids, formula = ~ condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
rawCountTable <- as.data.frame(counts(ids, norm = TRUE))

# normalization is doing using rlog from DESeq2
pheatmap(counts(ids, norm=TRUE), 
         annotation_col = data.frame(colData(ids)[,1,drop=FALSE]),
         show_rownames = FALSE, scale="row")

pdf("PDF_figure/Pheatmap.pdf",
    width = 5,
    height = 4)
pheatmap(counts(ids, norm=TRUE), 
         annotation_col = data.frame(colData(ids)[,1,drop=FALSE]),
         show_rownames = FALSE, scale="row")
dev.off()
## pdf 
##   3

Differnetial Analysis

# Perform differential analysis
dds_analysis <- isoDE(ids, formula = ~condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# Get a DESeq object using the count matrix
dds <- DESeqDataSetFromMatrix(counts(ids),
                             colData(ids), design = ~ condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_norm <- estimateSizeFactors(dds)

# transform the dataset for MA plot and PCA plotting
dds_transform <- varianceStabilizingTransformation(dds)
plotMA(dds_analysis)

pdf("PDF_figure/QC_MAPlot.pdf",
    width = 5,
    height = 4)
plotMA(dds_analysis)
dev.off()
## quartz_off_screen 
##                 2
plotPCA(dds_transform) +   
  geom_text(aes(label=name), vjust = 2, hjust = -0.1) +
  xlim(-12,30) +ylim(-20,15)

pdf("PDF_figure/QC_PCAPlot.pdf",
    width = 6,
    height = 4)
plotPCA(dds_transform) +   
  geom_text(aes(label=name), vjust = 2, hjust = -0.1) +
  xlim(-12,30) +ylim(-20,15)
dev.off()
## quartz_off_screen 
##                 2
#Boxplots
pseudoCount = log2(rawCountTable + 1)
suppressMessages(df <- melt(pseudoCount, variable_name = "Samples"))
## Warning in melt(pseudoCount, variable_name = "Samples"): The melt generic in
## data.table has been passed a data.frame and will attempt to redirect to the
## relevant reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend the
## namespace like reshape2::melt(pseudoCount). In the next version, this warning
## will become an error.
df <- data.frame(df, Condition = substr(df$variable,1,4))

ggplot(df, aes(x=variable, y=value, fill = Condition)) + geom_boxplot() + xlab("") + 
  ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

pdf("PDF_figure/QC_BoxPlot.pdf",
    width = 5,
    height = 4)
ggplot(df, aes(x=variable, y=value, fill = Condition)) + geom_boxplot() + xlab("") + 
  ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
dev.off()
## quartz_off_screen 
##                 2

There is good seperation between HVA and HVAK samples. I can proceed to mix the miRNA datasets from HEAP-CLIP_052119 and HEAP-CLIP_09282019.

Combined analysis

Re-compile filtered input files into isomiRs

Set working directory to the folder that contains only gene count .mirna files

setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/miRNA_Analysis/Gene Counts")
inDir = getwd()
mirFiles = list.files(inDir, pattern=".mirna$", full.names = TRUE)
basename(mirFiles)
##  [1] "HVA1_miRNA_LIB041590_GEN00154970_R1_barcode.cutadapt.mirna" 
##  [2] "HVA2_miRNA_LIB041590_GEN00154973_R1_barcode.cutadapt.mirna" 
##  [3] "HVA3_miRNA_LIB041590_GEN00154976_R1_barcode.cutadapt.mirna" 
##  [4] "HVA4_miRNA_LIB043893_GEN00166438_R1_barcode.cutadapt.mirna" 
##  [5] "HVA5_miRNA_LIB043893_GEN00166441_R1_barcode.cutadapt.mirna" 
##  [6] "HVA6_miRNA_LIB043893_GEN00166444_R1_barcode.cutadapt.mirna" 
##  [7] "HVAK1_miRNA_LIB041590_GEN00154979_R1_barcode.cutadapt.mirna"
##  [8] "HVAK2_miRNA_LIB041590_GEN00154982_R1_barcode.cutadapt.mirna"
##  [9] "HVAK3_miRNA_LIB041590_GEN00154985_R1_barcode.cutadapt.mirna"
## [10] "HVAK4_miRNA_LIB043893_GEN00166447_R1_barcode.cutadapt.mirna"
## [11] "HVAK5_miRNA_LIB043893_GEN00166450_R1_barcode.cutadapt.mirna"
## [12] "HVAK6_miRNA_LIB043893_GEN00166453_R1_barcode.cutadapt.mirna"
isomiRsampletable <- data.frame(
  row.names = c('HVA_1_miR','HVA_2_miR','HVA_3_miR','HVA_4_miR','HVA_5_miR','HVA_6_miR','HVAK_1_miR','HVAK_2_miR','HVAK_3_miR','HVAK_4_miR','HVAK_5_miR','HVAK_6_miR'),
  condition = c('control','control','control','control','control','control','experimental', 'experimental', 'experimental','experimental', 'experimental', 'experimental'),
  libType = c('single-end','single-end','single-end','single-end','single-end','single-end','single-end','single-end','single-end','single-end','single-end','single-end'),
  batch = c('1','1','1','2','2','2','1','1','1','2','2','2')
)

ids <- IsomirDataSeqFromFiles(mirFiles,
                              coldata = isomiRsampletable,
                              design = ~ batch + condition)

miRNA general abundance and count matrix

isoPlot(ids, type="all")
## Ussing 987 isomiRs.

pdf("PDF_figure/isoPlot_merged.pdf",
    width = 8,
    height = 6)
isoPlot(ids, type="all")
## Ussing 987 isomiRs.
dev.off()
## quartz_off_screen 
##                 2
# overview of the counts
head(counts(ids))
##               HVA_1_miR HVA_2_miR HVA_3_miR HVA_4_miR HVA_5_miR HVA_6_miR
## mmu-let-7a-5p      1664       881      1036      1864       303      1510
## mmu-let-7b-3p         8         7         8         5         9         7
## mmu-let-7b-5p      4388      3056      3177      5724      6348      5986
## mmu-let-7c-5p      2577      2105      2226      3595      3365      3612
## mmu-let-7d-3p       214       166       204       239       597       504
## mmu-let-7d-5p       506       306       425       531       152       489
##               HVAK_1_miR HVAK_2_miR HVAK_3_miR HVAK_4_miR HVAK_5_miR HVAK_6_miR
## mmu-let-7a-5p       1445       1226       1258       2418       2010       2096
## mmu-let-7b-3p          6          8         12          5          7          3
## mmu-let-7b-5p       3451       3678       3555       6298       5846       5955
## mmu-let-7c-5p       2366       2433       2238       3721       3583       3775
## mmu-let-7d-3p        174        194        183        349        321        343
## mmu-let-7d-5p        380        393        362        588        569        544
# output the count matrix
rawCountTable <- as.data.frame(counts(ids))

Differnetial Analysis using the DESeq2 package

Establish DESeq DataSet

# Get a DESeq object using the count matrix
dds <- DESeqDataSetFromMatrix(rawCountTable,
                             colData(ids), design = ~ batch + condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_analysis <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

MA plot was generated.

plotMA(dds_analysis)

pdf("PDF_figure/QC_MAPlot_merged.pdf",
    width = 5,
    height = 4)
plotMA(dds_analysis)
dev.off()
## quartz_off_screen 
##                 2

Quality Inspection of the Gene Count Data

Generate raw count table that contains the simple counts of each gene

Data is transformed and pseudocount is calculated.

dds_norm <- estimateSizeFactors(dds)
rawCountTable <- as.data.frame(counts(dds_norm, normalized = TRUE))
rawCountTable_no_norm <- as.data.frame(counts(dds_norm))
pseudoCount = log2(rawCountTable + 1)
grid.arrange(
  ggplot(pseudoCount, aes(x= HVA_1_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_1_miR"), 
  ggplot(pseudoCount, aes(x= HVA_2_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_2_miR"),
  ggplot(pseudoCount, aes(x= HVA_3_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_3_miR"),
  ggplot(pseudoCount, aes(x= HVA_4_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_4_miR"), 
  ggplot(pseudoCount, aes(x= HVA_5_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_5_miR"),
  ggplot(pseudoCount, aes(x= HVA_6_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_6_miR"),
  ggplot(pseudoCount, aes(x= HVAK_1_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_1_miR"),
  ggplot(pseudoCount, aes(x= HVAK_2_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_2_miR"),
  ggplot(pseudoCount, aes(x= HVAK_3_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_3_miR"), 
  ggplot(pseudoCount, aes(x= HVAK_4_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_4_miR"),
  ggplot(pseudoCount, aes(x= HVAK_5_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_5_miR"),
  ggplot(pseudoCount, aes(x= HVAK_6_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_6_miR"), 
  nrow = 3)

pdf("PDF_figure/QC_histogram.pdf",
    width = 10,
    height = 8)
grid.arrange(
  ggplot(pseudoCount, aes(x= HVA_1_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_1_miR"), 
  ggplot(pseudoCount, aes(x= HVA_2_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_2_miR"),
  ggplot(pseudoCount, aes(x= HVA_3_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_3_miR"),
  ggplot(pseudoCount, aes(x= HVA_4_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_4_miR"), 
  ggplot(pseudoCount, aes(x= HVA_5_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_5_miR"),
  ggplot(pseudoCount, aes(x= HVA_6_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_6_miR"),
  ggplot(pseudoCount, aes(x= HVAK_1_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_1_miR"),
  ggplot(pseudoCount, aes(x= HVAK_2_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_2_miR"),
  ggplot(pseudoCount, aes(x= HVAK_3_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_3_miR"), 
  ggplot(pseudoCount, aes(x= HVAK_4_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_4_miR"),
  ggplot(pseudoCount, aes(x= HVAK_5_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_5_miR"),
  ggplot(pseudoCount, aes(x= HVAK_6_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_6_miR"), 
  nrow = 3)
dev.off()
## quartz_off_screen 
##                 2
Between-sample distribution

Check on the gene count distribution across all genes.

#Boxplots
suppressMessages(df <- melt(pseudoCount, variable_name = "Samples"))
## Warning in melt(pseudoCount, variable_name = "Samples"): The melt generic in
## data.table has been passed a data.frame and will attempt to redirect to the
## relevant reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend the
## namespace like reshape2::melt(pseudoCount). In the next version, this warning
## will become an error.
df <- data.frame(df, Condition = substr(df$variable,1,4))

ggplot(df, aes(x=variable, y=value, fill = Condition)) + geom_boxplot() + xlab("") + 
  ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

pdf("PDF_figure/QC_BoxPlot_merged.pdf",
    width = 5,
    height = 4)
ggplot(df, aes(x=variable, y=value, fill = Condition)) + geom_boxplot() + xlab("") + 
  ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
dev.off()
## quartz_off_screen 
##                 2
#Histograms and density plots
ggplot(df, aes(x=value, colour = variable, fill = variable)) + ylim(c(0, 0.25)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab(expression(log[2](count+1)))

pdf("PDF_figure/QC_densityPlot_merged.pdf",
    width = 8,
    height = 6)
ggplot(df, aes(x=value, colour = variable, fill = variable)) + ylim(c(0, 0.25)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab(expression(log[2](count+1)))
dev.off()
## quartz_off_screen 
##                 2
Generate MA plots

MA plots are used to check for imbalance in sequencing depth between samples of the same condition. I did not generate MA plot for all library pairs. But the example pairs I selected show that there are imbalance in sequencing depth, but the imbalance is quite random and this is common in RNA-Seq datasets.

## HVA1 vs HVA2
x = pseudoCount[, 1]
y = pseudoCount[, 2]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)

suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
  geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "HVA_1_miR vs HVA_2_miR"))
## `geom_smooth()` using formula 'y ~ x'

## HVA1 vs HVA3
x = pseudoCount[, 1]
y = pseudoCount[, 3]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)

suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
  geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "HVA_1_miR vs HVA_3_miR"))
## `geom_smooth()` using formula 'y ~ x'

## HVA1 vs HVAK1
x = pseudoCount[, 1]
y = pseudoCount[, 4]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)

suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
  geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "HVA_1_miR vs HVAK_1_miR"))
## `geom_smooth()` using formula 'y ~ x'

## HVA1 vs HVAK2
x = pseudoCount[, 1]
y = pseudoCount[, 5]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)

suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
  geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "HVA_1_miR vs HVAK_2_miR"))
## `geom_smooth()` using formula 'y ~ x'

## HVA1 vs HVAK3
x = pseudoCount[, 1]
y = pseudoCount[, 6]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)

suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
  geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "HVA_1_miR vs HVAK_3_miR"))
## `geom_smooth()` using formula 'y ~ x'

##### Clustering of the sample-to-sample distances This is the sanity check step to confirm that under a un-supervised clustering, HF and HFK samples will cluster together. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pnf

dds_transform <- varianceStabilizingTransformation(dds)
rawCountTable_transform <- as.data.frame(assay(dds_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
pdf("PDF_figure/Hierchical_Clustering.pdf",
    width = 6,
    height = 6)
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
dev.off()
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(dds_transform, intgroup = "condition", ntop = 500) +   
  geom_text(aes(label=name), vjust = 2, hjust = -0.1) + xlim(-7,20) + ylim(-10,10)

pdf("PDF_figure/QC_PCAPlot_merged.pdf",
    width = 6,
    height = 4)
plotPCA(dds_transform, intgroup = "condition", ntop = 500) +   
  geom_text(aes(label=name), vjust = 2, hjust = -0.1) + xlim(-7,20) + ylim(-10,10)
dev.off()
## quartz_off_screen 
##                 2
Outout gene counts and DE analysis result
dds_result <- results(dds_analysis, contrast = c("condition", "experimental", "control"))
dds_result <- lfcShrink(dds_analysis, contrast = c("condition", "experimental", "control"), res = dds_result, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
write.csv(rawCountTable, "miRNA_counts.csv")
write.csv(as.data.frame(dds_result), "miRNA_de.csv")
Draw heatmap for transcripts that are significantly dysregulated in KRasG12D samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

suppressMessages(library(mosaic))

dif_analysis <- as.data.frame(results(dds_analysis))
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.05)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(pseudoCount)))
}
sig_count <- pseudoCount[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:18] <- zscore(as.numeric(sig_dif[i,7:18]))
}

my_palette <- colorRampPalette(c("blue", "white", "red"))(256)
heatmap_matrix <- as.matrix(sig_dif[,7:18])

png('HVAK vs HVA microRNASeq.png',
    width = 600,
    height = 1000,
    res = 100,
    pointsize = 8)
par(cex.main=1.1)
heatmap.2(heatmap_matrix,
          main = "Differentially expressed\nmiRNA in colon tumor\npadj < 0.05",
          density.info = "none",
          key = TRUE,
          lwid = c(3,7),
          lhei = c(1,7),
          labRow = rownames(heatmap_matrix),
          col=my_palette,
          margins = c(12,11),
          symbreaks = TRUE,
          trace = "none",
          dendrogram = "row",
          cexCol = 2,
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf("PDF_figure/Heatmap_merged.pdf",
    width = 7,
    height = 10)
par(cex.main=1.1)
heatmap.2(heatmap_matrix,
          main = "Differentially expressed\nmiRNA in colon tumor\npadj < 0.05",
          density.info = "none",
          key = TRUE,
          lwid = c(3,7),
          lhei = c(1,7),
          labRow = rownames(heatmap_matrix),
          col=my_palette,
          margins = c(12,11),
          symbreaks = TRUE,
          trace = "none",
          dendrogram = "row",
          cexCol = 2,
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for differential genes

Scatter plot, MA plot and Volcano plot for data visualization
# Scatter plot
dif_analysis$KrasG12D_mean <- rowMeans(pseudoCount[,7:12])
dif_analysis$KrasWT_mean <- rowMeans(pseudoCount[,1:6])
ggplot(dif_analysis, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
  xlab("HVA_Average(log2)") + ylab("HVAK_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "HVA vs HVAK Scatter Plot")

pdf("PDF_figure/ScatterPlot_merged.pdf",
    width = 5,
    height = 4)
ggplot(dif_analysis, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
  xlab("HVA_Average(log2)") + ylab("HVAK_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "HVA vs HVAK Scatter Plot")
dev.off()
## quartz_off_screen 
##                 2
# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "HVA vs HVAK MA Plot") + ylim(-4,4)
## Warning: Removed 1 rows containing missing values (geom_point).

pdf("PDF_figure/MAPlot_merged.pdf",
    width = 5,
    height = 4)
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "HVA vs HVAK MA Plot") + ylim(-4,4)
## Warning: Removed 1 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2
# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(padj,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "HVA vs HVAK Volcano Plot") + xlim(-4,4)
## Warning: Removed 164 rows containing missing values (geom_point).

pdf("PDF_figure/VolcanoPlot_merged.pdf",
    width = 5,
    height = 4)
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(padj,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "HVA vs HVAK Volcano Plot") + xlim(-4,4)
## Warning: Removed 164 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2

Prepare miRNA family level data for analysis with peaks

Load miRNA database

mirna.info <- fread("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/miRNA_Analysis/Analysis/miR_Family_Info.txt", check.names = TRUE)
mirna.info <- mirna.info[Species.ID == 10090]
mirna.info[MiRBase.ID == "mmu-miR-124-3p.1", MiRBase.ID := "mmu-miR-124-3p"]
mirna.info[miR.family == "miR-124-3p.1", miR.family := "miR-124-3p"]
mirna.info[, seed := gsub("U", "T", Seed.m8)]

Load miRNA counts.

mirna.counts <- rawCountTable_no_norm
mirna.counts$miRNA <- rownames(mirna.counts)
mirna.counts <- as.data.table(mirna.counts)

mirna.counts$count <-
    rowSums(mirna.counts[,-13])
mirna.counts$count.hva <-
    rowSums(mirna.counts[, c("HVA_1_miR", "HVA_2_miR", "HVA_3_miR", "HVA_4_miR", "HVA_5_miR", "HVA_6_miR")])
mirna.counts$count.hvak <-
    rowSums(mirna.counts[, c("HVAK_1_miR", "HVAK_2_miR", "HVAK_3_miR", "HVAK_4_miR", "HVAK_5_miR", "HVAK_6_miR")])
mirna.counts <- mirna.counts[order(-count)]
mirna.counts[, miRNA.shortname := miRNA]
which.to.change <- startsWith(mirna.counts$miRNA.shortname, "mmu-")
mirna.counts[which.to.change, ]$miRNA.shortname <-
    sapply(strsplit(mirna.counts[which.to.change, ]$miRNA.shortname, "mmu-"),
           "[", 2)
which.to.change <- endsWith(mirna.counts$miRNA.shortname, "-5p")
mirna.counts[which.to.change]$miRNA.shortname <-
    sapply(strsplit(mirna.counts[which.to.change]$miRNA.shortname, "-5p"),
           "[", 1)
which.to.change <- endsWith(mirna.counts$miRNA.shortname, "-3p")
mirna.counts[which.to.change]$miRNA.shortname <-
    sapply(strsplit(mirna.counts[which.to.change]$miRNA.shortname, "-3p"),
           "[", 1)
dds <- dds[rowSums(DESeq2::counts(dds)) > 200]
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotMA(dds)

mir.dif <- as.data.frame(results(dds, alpha = 0.05, contrast = c("condition", "experimental", "control")))
mir.dif$logP <- -1 * log10(mir.dif$padj)
mir.dif$miRNA <- rownames(mir.dif)
mirna.counts.DGE <- merge(mirna.counts, mir.dif, by = "miRNA", all = TRUE)
setwd("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/Datafiles")
saveRDS(mirna.counts.DGE, "mirna-counts-deseq-09282019.rds")
mirna.info <- merge(mirna.info, mirna.counts[, .(MiRBase.ID = miRNA,
                                                 HVA_1_miR, HVA_2_miR, HVA_3_miR, HVA_4_miR, HVA_5_miR,
                                                 HVA_6_miR, HVAK_1_miR, HVAK_2_miR, HVAK_3_miR, HVAK_4_miR,
                                                 HVAK_5_miR, HVAK_6_miR)],
                    by = "MiRBase.ID", all.x = TRUE)

mirna.info[, HVA_1_miR := ifelse(is.na(HVA_1_miR), 0, HVA_1_miR)]
mirna.info[, HVA_2_miR := ifelse(is.na(HVA_2_miR), 0, HVA_2_miR)]
mirna.info[, HVA_3_miR := ifelse(is.na(HVA_3_miR), 0, HVA_3_miR)]
mirna.info[, HVA_4_miR := ifelse(is.na(HVA_4_miR), 0, HVA_4_miR)]
mirna.info[, HVA_5_miR := ifelse(is.na(HVA_5_miR), 0, HVA_5_miR)]
mirna.info[, HVA_6_miR := ifelse(is.na(HVA_6_miR), 0, HVA_6_miR)]
mirna.info[, HVAK_1_miR := ifelse(is.na(HVAK_1_miR), 0, HVAK_1_miR)]
mirna.info[, HVAK_2_miR := ifelse(is.na(HVAK_2_miR), 0, HVAK_2_miR)]
mirna.info[, HVAK_3_miR := ifelse(is.na(HVAK_3_miR), 0, HVAK_3_miR)]
mirna.info[, HVAK_4_miR := ifelse(is.na(HVAK_4_miR), 0, HVAK_4_miR)]
mirna.info[, HVAK_5_miR := ifelse(is.na(HVAK_5_miR), 0, HVAK_5_miR)]
mirna.info[, HVAK_6_miR := ifelse(is.na(HVAK_6_miR), 0, HVAK_6_miR)]


mirna.info[, HVA_1_family := sum(HVA_1_miR),
           by = miR.family]
mirna.info[, HVA_2_family := sum(HVA_2_miR),
           by = miR.family]
mirna.info[, HVA_3_family := sum(HVA_3_miR),
           by = miR.family]
mirna.info[, HVA_4_family := sum(HVA_4_miR),
           by = miR.family]
mirna.info[, HVA_5_family := sum(HVA_5_miR),
           by = miR.family]
mirna.info[, HVA_6_family := sum(HVA_6_miR),
           by = miR.family]
mirna.info[, HVAK_1_family := sum(HVAK_1_miR),
           by = miR.family]
mirna.info[, HVAK_2_family := sum(HVAK_2_miR),
           by = miR.family]
mirna.info[, HVAK_3_family := sum(HVAK_3_miR),
           by = miR.family]
mirna.info[, HVAK_4_family := sum(HVAK_4_miR),
           by = miR.family]
mirna.info[, HVAK_5_family := sum(HVAK_5_miR),
           by = miR.family]
mirna.info[, HVAK_6_family := sum(HVAK_6_miR),
           by = miR.family]

mirna.counts.family <- mirna.info[, .(miR.family, Seed.m8, Family.Conservation., 
                                      HVA_1_family, HVA_2_family, HVA_3_family, HVA_4_family, HVA_5_family,
                                      HVA_6_family, HVAK_1_family, HVAK_2_family, HVAK_3_family, HVAK_4_family,
                                      HVAK_5_family, HVAK_6_family)]
mirna.counts.family <- mirna.counts.family[!duplicated(mirna.counts.family)]
setwd("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/Datafiles")
saveRDS(mirna.counts.family, "mirna-counts-by-family-09282019.rds")
dds.family <- DESeqDataSetFromMatrix(mirna.counts.family[,4:15],
                              colData = data.frame(row.names = colnames(mirna.counts.family[,4:15]),
                                                   condition = c(rep("HVA",6), rep("HVAK", 6)),
                                                   batch = c('1','1','1','2','2','2','1','1','1','2','2','2')),
                              design = ~ batch + condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
rownames(dds.family) <- mirna.counts.family$miR.family
dds.family <- dds.family[rowSums(DESeq2::counts(dds.family))>200]
dds.family <- DESeq(dds.family) 
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
mirna.family.res <- results(dds.family, alpha = 0.05, contrast = c("condition", "HVAK", "HVA"))
mirna.family.DGE <- as.data.frame(mirna.family.res)
mirna.family.DGE$miR.family <- rownames(mirna.family.DGE)
mirna.family.DGE <- mirna.family.DGE[order(-mirna.family.DGE$log2FoldChange),]
setwd("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/Datafiles")
saveRDS(mirna.family.DGE, "mirna-counts-deseq-by-family-09282019.rds")
cal.z.score <- function(x){
  (x - mean(x)) / sd(x)
}

mirna.counts.family.norm <- DESeq2::counts(dds.family, normalized = TRUE)
mirna.counts.family.norm.log <- log2(mirna.counts.family.norm + 1)
mirna.counts.family.norm.z <- as.data.frame(t(apply(mirna.counts.family.norm.log, 1, cal.z.score)))
select <- subset(mirna.family.DGE, baseMean > 2000)
select <- select$miR.family
pheatmap(mirna.counts.family.norm.z[select,], 
         cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE,
         border_color = NA, fontsize_row = 8)

pdf("PDF_figure/Pheatmap_family.pdf",
    width = 8,
    height = 5)
pheatmap(mirna.counts.family.norm.z[select,], 
         cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE,
         border_color = NA, fontsize_row = 8)
dev.off()
## pdf 
##   3
df <- mirna.family.DGE
df$logP <- -log10(mirna.family.DGE$padj)
p3 <- ggplot(data = df, aes(x = log2FoldChange, y = logP, label = miR.family, size = baseMean)) +
  geom_point(alpha = 0.6, colour = "#EC469A", shape = 1) + 
  #geom_point(data = family.highlight, aes(x = log2FoldChange, y = logP), colour = my.colors[1]) + 
  #geom_point(data = df[mir.highlight[1],], aes(x = log2FoldChange, y = logP), colour = my.colors[9], size = dotsize) +
  #geom_point(data = df[df$Row.names %in% mir.highlight[2:6],], aes(x = log2FoldChange, y = logP), colour = my.colors[4], size = dotsize) +
  #geom_point(data = df[mir.highlight[7],], aes(x = log2FoldChange, y = logP), colour = my.colors[2], size = dotsize) +
  #geom_point(data = df[mir.highlight[8:11],], aes(x = log2FoldChange, y = logP), colour = my.colors[4], size = dotsize) +
  #geom_point(data = df[df$Row.names %in% mir.highlight[12:13],], aes(x = log2FoldChange, y = logP), colour = my.colors[6], size = dotsize) +
  #geom_point(data = df[mir.highlight[13],], aes(x = log2FoldChange, y = logP), colour = my.colors[4], size = dotsize) +
  xlab("Fold change log2 (HVAK / HVA)") +
  ylab("-log10(FDR)") +
  theme_bw() +
  theme(panel.border = element_blank(),
    panel.background = element_blank(),
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    axis.title.x = element_text(size=12, margin = margin(t = 10)),
    axis.title.y = element_text(size=12, margin = margin(r = 10)),
    axis.text = element_text(size=10),
    axis.line.y = element_line(size = 0.5),
    axis.line.x = element_line(size = 0.5),
    axis.ticks.x = element_line(size = 0),
    axis.ticks.y = element_line(size = 0.5),
    plot.title = element_text(hjust = 0.5, size = 12, face = "bold")) + xlim(-4,3)
p4<- ggplotly(p3)
p4
pdf("PDF_figure/VolcanoPlot_expression.pdf",
    width = 5,
    height = 4)
p3
## Warning: Removed 2 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2